{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#params\n",
    "DOCS_FOLDER = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs\"\n",
    "FILE_MASK = \".*\\.txt\"\n",
    "\n",
    "MIN_DOC_FREQ = 30\n",
    "MAX_PHRASE_LEN = 4\n",
    "STOP_WORDS_FILE = \"/Users/simon.hughes/GitHub/analytics-python/SolrlikeAnalysisPipeline/data/dice_stop_words.txt\"\n",
    "PHRASES_FILE    = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt\"\n",
    "PHRASE_FREQ_FILE= '/Users/simon.hughes/Documents/Dice Data/LuceneTalk/phrases_freq.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "full_start = time.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import time\n",
    "import hashlib\n",
    "from collections import defaultdict\n",
    "\n",
    "re_collapse_spaces = re.compile(\"\\s+\")\n",
    "\n",
    "def collapse_spaces(s):\n",
    "    return re_collapse_spaces.sub(\" \", s).strip()\n",
    "\n",
    "re1 = re.compile(\"[;:\\'\\\"\\*/\\),\\(\\|\\s]+\")\n",
    "def clean_str(s):\n",
    "    s = str(s).replace(\"'s\",\" \")\n",
    "    #doesn't work in regex\n",
    "    s = s.replace(\"-\", \" \").replace(\"\\\\\",\" \")\n",
    "    s = re1.sub(\" \",s).strip()\n",
    "    return collapse_spaces(s)\n",
    "\n",
    "def compute_ngrams(tokens, max_len = None, min_len = 1):\n",
    "    \"\"\"\n",
    "    tokens  :   iterable of string\n",
    "                    a single sentence of tokens. Assumes start and stop tokens omitted\n",
    "    max_len :   int\n",
    "                    maximum ngram length\n",
    "    min_len :   int\n",
    "                    minimum ngram length\n",
    "\n",
    "    \"\"\"\n",
    "    if max_len == None:\n",
    "        max_len = len(tokens)\n",
    "\n",
    "    if min_len > max_len:\n",
    "        raise Exception(\"min_len cannot be more than max_len\")\n",
    "\n",
    "    ngrams = set()\n",
    "    # unigrams\n",
    "    for ngram_size in range(min_len, max_len + 1):\n",
    "        for start in range(0, len(tokens) - ngram_size + 1):\n",
    "            end = start + ngram_size -1\n",
    "            words = []\n",
    "            for i in range(start, end + 1):\n",
    "                words.append(tokens[i])\n",
    "            ngrams.add(tuple(words)) # make a tuple so hashable\n",
    "    return ngrams\n",
    "\n",
    "# is a valid token\n",
    "__bad_chars__ = \"<>{}[]~@\"\n",
    "__punct__ = set(\".?!,;:\")\n",
    "def is_valid_term(term):\n",
    "    # remove single char entries and only numeric\n",
    "    if len(term) == 0:\n",
    "        return False\n",
    "    if len(term) == 1:\n",
    "        #else misses c and r\n",
    "        if term.isalpha():\n",
    "            return True\n",
    "        return False\n",
    "    # no html or js terms\n",
    "    for c in __bad_chars__:\n",
    "        if c in term:\n",
    "            return False\n",
    "    if term[-1] in __punct__:\n",
    "        return False\n",
    "    if \"function(\" in term:\n",
    "        return False\n",
    "    if \"!\" in term or \"?\" in term:\n",
    "        return False\n",
    "    digit_chars = 0.0\n",
    "    for c in term:\n",
    "        if c.isdigit() or not c.isalnum():\n",
    "            digit_chars += 1.0\n",
    "    # 60% digits?\n",
    "    if (digit_chars / len(term)) >= 0.75:\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "re1 = re.compile(\"[;:\\'\\\"\\*/\\),\\(\\-\\|\\s]+\")\n",
    "\n",
    "# we may want to keep some non-alpha characters, such as # in C# and + in C++, etc.\n",
    "def remove_punct(s):\n",
    "    s = s.replace(\"'s\",\" \")\n",
    "    return collapse_spaces(re1.sub(\" \",s).strip())\n",
    "\n",
    "def hash_string(s):\n",
    "    hash_object = hashlib.md5(b'%s' % s)\n",
    "    return str(hash_object.hexdigest())\n",
    "\n",
    "def find_files(folder, regex, remove_empty = False):\n",
    "    \"\"\"\n",
    "    Find all files matching the [regex] pattern in [folder]\n",
    "\n",
    "    folder  :   string\n",
    "                    folder to search (not recursive)\n",
    "    regex   :   string (NOT regex object)\n",
    "                    pattern to match\n",
    "    \"\"\"\n",
    "    files = os.listdir(folder)\n",
    "    matches = [os.path.abspath(os.path.join(folder, f))\n",
    "               for f in files\n",
    "               if re.search(regex, f, re.IGNORECASE)]\n",
    "\n",
    "    if remove_empty:\n",
    "        matches = [f for f in matches if os.path.getsize(f) > 0]\n",
    "    matches.sort()\n",
    "    return matches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Stop Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1445 stop words loaded\n"
     ]
    }
   ],
   "source": [
    "# load stop words\n",
    "def load_stop_words(stop_words_file):\n",
    "    stop_words = set()\n",
    "    with open(stop_words_file) as f:\n",
    "            for line in f:\n",
    "                word = line.strip()\n",
    "                if word[0] != \"#\":\n",
    "                    word = word.lower()\n",
    "                    stop_words.add(word)\n",
    "    return stop_words\n",
    "\n",
    "if STOP_WORDS_FILE:\n",
    "    stop_words = load_stop_words(STOP_WORDS_FILE)\n",
    "    print(\"%i stop words loaded\" % len(stop_words))\n",
    "else:\n",
    "    stop_words = set()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "66989 files found in /Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs\n",
      "Loading 66989 documents took 14.0081241131 seconds\n"
     ]
    }
   ],
   "source": [
    "import os, re\n",
    "start = time.time()\n",
    "\n",
    "files = find_files(DOCS_FOLDER, FILE_MASK, True)\n",
    "print(\"%s files found in %s\" % (len(files), DOCS_FOLDER))\n",
    "documents = []\n",
    "for i, fname in enumerate(files):\n",
    "    with open(fname) as f:\n",
    "        contents = f.read()\n",
    "        documents.append(contents.split(\"\\n\"))\n",
    "end = time.time()\n",
    "print(\"Loading %i documents took %s seconds\" % (len(files), str(end - start)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract Common Terms and Phrases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "55.7478640079 secs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(62696, 'docs', 137661, 'unique tokens')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "start = time.time()\n",
    "#Or use a counter here.\n",
    "doc_freq = defaultdict(int)\n",
    "\n",
    "# remove short docs\n",
    "tokenized_docs = []\n",
    "sent_id = 0\n",
    "sent_ids = set()\n",
    "lens = []\n",
    "hashed = set()\n",
    "\n",
    "for doc in documents:\n",
    "    un_tokens = set()\n",
    "    tok_sents = []\n",
    "    for sent in doc:\n",
    "        cl_sent = remove_punct(sent.lower())\n",
    "        hash_sent = hash_string(cl_sent)\n",
    "        # remove dupe sentences (not - will hurt df accuracy a little)\n",
    "        if hash_sent in hashed:\n",
    "            continue\n",
    "        hashed.add(hash_sent)\n",
    "\n",
    "        tokens = tuple(cl_sent.split(\" \"))\n",
    "        lens.append(len(tokens))\n",
    "        sent_id += 1\n",
    "        tok_sents.append((sent_id, tokens))\n",
    "        sent_ids.add(sent_id)\n",
    "        \n",
    "        # create inverted index and unique tokens (for doc freq calc)\n",
    "        proc_tokens = set()\n",
    "        for tok in tokens:\n",
    "            if not tok in proc_tokens:\n",
    "                proc_tokens.add(tok)\n",
    "                if not tok in un_tokens:\n",
    "                    un_tokens.add(tok)                    \n",
    "                    doc_freq[tok] += 1\n",
    "        \n",
    "    if len(tok_sents) > 0:\n",
    "        tokenized_docs.append(tok_sents)\n",
    "\n",
    "end = time.time()\n",
    "print end-start, \"secs\"\n",
    "len(tokenized_docs), \"docs\", len(doc_freq), \"unique tokens\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8455 frequent terms identified for building phrases\n"
     ]
    }
   ],
   "source": [
    "# Get really frequent items for removal\n",
    "num_docs = float(len(tokenized_docs))\n",
    "above_threshold = [k for k,v in doc_freq.items() if v >= MIN_DOC_FREQ]\n",
    "\n",
    "# remove really frequent terms (in 50% or more of documents)\n",
    "too_freq = set([k for k in above_threshold if (doc_freq[k]/num_docs) >= 0.50])\n",
    "\n",
    "freq_terms = [k for k in above_threshold \n",
    "              if  k not in stop_words and \n",
    "                  k not in too_freq and \n",
    "                  is_valid_term(k)]\n",
    "print(\"%s frequent terms identified for building phrases\" % len(freq_terms))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "phrase_len 2\n",
      "62696 docs 8455 terms 1213706 sentences\n",
      "\t22957 phrases found\n",
      "\ttook 33.1819739342 seconds\n",
      "phrase_len 3\n",
      "61803 docs 3130 terms 772061 sentences\n",
      "\t25250 phrases found\n",
      "\ttook 14.6069819927 seconds\n",
      "phrase_len 4\n",
      "59772 docs 1066 terms 132253 sentences\n",
      "\t25579 phrases found\n",
      "\ttook 3.33791399002 seconds\n",
      "\n",
      "took 56.7129700184 seconds\n",
      "25579 phrases found\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "start = time.time()\n",
    "\n",
    "# Find all phrases up to length MAX_PHRASE_LEN at or above the defined MIN_DOC_FREQ above\n",
    "phrase_doc_freq = defaultdict(int)\n",
    "for term in freq_terms:\n",
    "    phrase_doc_freq[tuple([term])] = doc_freq[term]\n",
    "    \n",
    "# data structure is a list of list (document) of pairs - sentences: (int, list  (of tokens))\n",
    "# each item is a doc, a list of sents. each sent is a list of valid remaining phrases\n",
    "# seed with one valid phrase per sent\n",
    "\n",
    "#working_docs = [map(lambda sent: [sent], d) for d in tokenized_docs]\n",
    "working_docs = [map(lambda (sid, sent): (sid, [sent]), d) for d in tokenized_docs]\n",
    "working_freq_terms = set(freq_terms)\n",
    "\n",
    "# sentences with one or more phrases that are frequent enough (under the apriori algorithm closure priniciple)\n",
    "working_sent_ids = set(sent_ids)\n",
    "# don't bother whitling this down further at the start, almost all sentences have at least on freq term in them\n",
    "\n",
    "for phrase_len in range(2, MAX_PHRASE_LEN + 1):\n",
    "    phrase_start = time.time()\n",
    "    print \"phrase_len\", phrase_len\n",
    "    print len(working_docs), \"docs\", len(working_freq_terms), \"terms\", len(working_sent_ids), \"sentences\"\n",
    "    # for current phrase_len\n",
    "    current_phrase_doc_freq = defaultdict(int)\n",
    "    \n",
    "    # used to look up sentence ids by phrase\n",
    "    phrase2sentids = defaultdict(set)\n",
    "    \n",
    "    new_work_docs = []\n",
    "    for doc in working_docs:\n",
    "        new_work_sents = []\n",
    "        unique_potential_phrases = set()\n",
    "        for sent_id, phrases in doc:\n",
    "            if sent_id not in working_sent_ids:\n",
    "                continue\n",
    "            \n",
    "            new_work_phrases = []\n",
    "            for phrase in phrases:\n",
    "                current_phrase = []\n",
    "                for term in phrase:\n",
    "                    if term in working_freq_terms:\n",
    "                        current_phrase.append(term)\n",
    "                    else:\n",
    "                        if len(current_phrase) >= phrase_len:\n",
    "                            new_work_phrases.append(current_phrase)\n",
    "                        current_phrase = []\n",
    "                \n",
    "                if len(current_phrase) >= phrase_len:\n",
    "                    new_work_phrases.append(current_phrase)\n",
    "            \n",
    "            if len(new_work_phrases) > 0:\n",
    "                for phrase in new_work_phrases:\n",
    "                    ngrams = compute_ngrams(phrase, phrase_len, phrase_len)\n",
    "                    for tpl_ngram in ngrams:                        \n",
    "                        unique_potential_phrases.add(tpl_ngram)\n",
    "                        phrase2sentids[tpl_ngram].add(sent_id)\n",
    "                        \n",
    "                new_work_sents.append((sent_id, new_work_phrases))\n",
    "        \n",
    "        # end for sent in doc\n",
    "        # for doc freq, need to compute unique phrases in document\n",
    "        for unique_tpl_phrase in unique_potential_phrases:\n",
    "            current_phrase_doc_freq[unique_tpl_phrase] +=1\n",
    "        \n",
    "        if len(new_work_sents) > 0:\n",
    "            new_work_docs.append(new_work_sents)\n",
    "\n",
    "    new_working_freq_terms = set()\n",
    "    new_working_sent_ids = set()\n",
    "    for tuple_phrase, freq in current_phrase_doc_freq.items():\n",
    "        if freq < MIN_DOC_FREQ:\n",
    "            continue            \n",
    "        phrase_doc_freq[tuple_phrase] = freq\n",
    "        new_working_sent_ids.update(phrase2sentids[tuple_phrase])\n",
    "        for tok in tuple_phrase:\n",
    "            new_working_freq_terms.add(tok)\n",
    "    \n",
    "    if len(new_working_freq_terms) <= 1 or len(new_work_docs) == 0 or len(new_working_sent_ids) == 0:\n",
    "        break\n",
    "    working_docs = new_work_docs\n",
    "    working_freq_terms = new_working_freq_terms\n",
    "    working_sent_ids = new_working_sent_ids\n",
    "    phrase_end = time.time()\n",
    "    print \"\\t\", len(phrase_doc_freq), \"phrases found\"\n",
    "    print \"\\ttook\", phrase_end - phrase_start, \"seconds\"\n",
    "    \n",
    "print \"\"\n",
    "end = time.time()\n",
    "print \"took\", end - start, \"seconds\"\n",
    "print len(phrase_doc_freq), \"phrases found\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Find Sub-Phrases that Are Of Near Equivalent Frequencies and REMOVE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "793"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# there are a lot of short phrases that always or nearly always have the same DF as longer phrases that they constitute\n",
    "\n",
    "def find_sub_phrases_to_remove(tpl_phrase, valid_phrases, doc_freq, to_rem):\n",
    "    if len(tpl_phrase) <= 1:\n",
    "        return\n",
    "    phrase_df = doc_freq[tpl_phrase]\n",
    "    ngrams = compute_ngrams(tpl_phrase, len(tpl_phrase)-1, 1)\n",
    "    for tpl_ngram in ngrams:\n",
    "        if tpl_ngram in valid_phrases and tpl_ngram not in to_rem:\n",
    "            sub_phr_df = doc_freq[tpl_ngram]\n",
    "            # if sub_phrase_df is close to the same frequency\n",
    "            if phrase_df >= (0.9 * sub_phr_df):\n",
    "                to_rem.add(tpl_ngram)\n",
    "                #to_rem_dbg.add((tpl_phrase, tpl_ngram, phrase_df, sub_phr_df))\n",
    "                find_sub_phrases_to_remove(tpl_ngram, valid_phrases, doc_freq, to_rem)\n",
    "\n",
    "# don't process unigrams\n",
    "valid_phrases = set(phrase_doc_freq.keys())\n",
    "phrases = filter(lambda k: len(k) > 1, phrase_doc_freq.keys())\n",
    "to_remove = set()\n",
    "\n",
    "for tpl_key in sorted(phrases, key = lambda k: -len(k)):\n",
    "    if tpl_key not in to_remove:\n",
    "        phrase_df = phrase_doc_freq[tpl_key]\n",
    "        # find all shorter sub-phrases\n",
    "        find_sub_phrases_to_remove(tpl_key, valid_phrases, phrase_doc_freq, to_remove)\n",
    "\n",
    "len(to_remove)#, len(to_remove_dbg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Whole process took 127.771396875 seconds\n"
     ]
    }
   ],
   "source": [
    "#Dump phrases to file\n",
    "with open(PHRASES_FILE, \"w+\") as f:\n",
    "    for tpl in sorted(phrase_doc_freq.keys()):\n",
    "        # phrases only\n",
    "        if tpl not in to_remove:\n",
    "            joined = \" \".join(tpl)\n",
    "            f.write(joined + \"\\n\")\n",
    "    \n",
    "full_end = time.time()\n",
    "print(\"Whole process took %s seconds\" % str(full_end - full_start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open(PHRASE_FREQ_FILE, \"w+\") as f:\n",
    "    for tpl,v in sorted(phrase_doc_freq.items(), key = lambda (k,v): -v):\n",
    "        # phrases only\n",
    "        if tpl not in to_remove:\n",
    "            joined = \" \".join(tpl)\n",
    "            f.write(joined + \",\" + str(v) + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
